{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "id": "TR7-OiibmzJ7",
    "outputId": "ddbc36b9-214e-411a-fef6-a7520b8e7f1e"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "\n",
    "import numpy as np\n",
    "import keras\n",
    "\n",
    "import os\n",
    "import json\n",
    "\n",
    "INPUT_DIR = './'\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "K0r4Fcj8M_--"
   },
   "source": [
    "# 数据处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
    "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a",
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "id": "qIDMPGEvmzKC",
    "outputId": "f8ba9e72-62b5-4bf8-a66b-4f885f5af1fc"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数据总数：20001, 欺凌数据个数：7822, 友好数据个数：12179\n"
     ]
    }
   ],
   "source": [
    "f = open(INPUT_DIR+'Dataset for Detection of Cyber-Trolls.json')\n",
    "TrollsData = [] # Label 为 1\n",
    "NonTrollsData = [] # Label 为 0\n",
    "for i in f:\n",
    "    temp = json.loads(i)\n",
    "    content = temp['content']\n",
    "    label = int(temp['annotation']['label'][0])\n",
    "    Data = {\n",
    "        \"content\":content,\n",
    "        \"annotation\":label\n",
    "    }\n",
    "    if label == 0:\n",
    "        NonTrollsData.append(Data)\n",
    "    else:\n",
    "        TrollsData.append(Data)\n",
    "f.close()\n",
    "print('数据总数：%d, 欺凌数据个数：%d, 友好数据个数：%d' % (len(TrollsData)+len(NonTrollsData),len(TrollsData),len(NonTrollsData))) \n",
    "#print(json.dumps(data))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "mVEGOXh7mzKF"
   },
   "source": [
    "可以看到数据不是很均衡,我们对欺凌数据进行重复采样。\n",
    "采用复制加随机拼接的方法再生成出一份样本"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "id": "G2p-5bbImzKG",
    "outputId": "dc33b897-9d40-4990-8521-385415482a6a"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "现在欺凌数据个数:15644\n"
     ]
    }
   ],
   "source": [
    "import random \n",
    "import copy\n",
    "\n",
    "new_temp_TrollsData =  copy.deepcopy(TrollsData)\n",
    "random.shuffle(new_temp_TrollsData)\n",
    "\n",
    "for i in range(len(new_temp_TrollsData)):\n",
    "    content = TrollsData[i]['content'].split(' ')\n",
    "    content_len = len(content)\n",
    "    r = random.randint(0,content_len - 1)\n",
    "    new_temp_TrollsData[i]['content'] += ' ' + content[r]\n",
    "    r = random.randint(0,content_len - 1)\n",
    "    new_temp_TrollsData[i]['content'] += ' ' + content[r]\n",
    "\n",
    "TrollsData += new_temp_TrollsData\n",
    "print('现在欺凌数据个数:%d'%len(TrollsData))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "A_JogsnmmzKJ"
   },
   "source": [
    "欺凌个数又超过友好数据的个数了，我们剪裁到和友好数据一样的大小"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "id": "bEFRbPBrmzKK",
    "outputId": "ae6676a2-5bbf-4e96-893c-de03c0875e3b"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数据总数：24358, 欺凌数据个数：12179, 友好数据个数：12179\n"
     ]
    }
   ],
   "source": [
    "TrollsData = TrollsData[:12179]\n",
    "print('数据总数：%d, 欺凌数据个数：%d, 友好数据个数：%d' % (len(TrollsData)+len(NonTrollsData),len(TrollsData),len(NonTrollsData))) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "-GXQQfQqmzKM"
   },
   "source": [
    "把数据整合到一起，然后，打乱它"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 54
    },
    "colab_type": "code",
    "id": "Qg-jao0dmzKN",
    "outputId": "1ed0a54e-83e9-4f75-8134-beb0465f2e79"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'content': 'lmfaoooooo. JEALOUS bitches make me sick!(; so nicoo  ahaah what th fxck is up! :D', 'annotation': 0}, {'content': 'is officially too gay to function! be owners', 'annotation': 1}, {'content': 'the nicest thing someone has ever done for would probaly be  when sam my sister just told me for no reason that i was a really good person and stuff', 'annotation': 0}, {'content': '\" Fuck Sarah Palin  the political version on Sanjaya\".....perfection.', 'annotation': 0}] 24358\n"
     ]
    }
   ],
   "source": [
    "Data = TrollsData + NonTrollsData\n",
    "random.shuffle(Data)\n",
    "print(Data[:4],len(Data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "d18Sa_IjmzKQ"
   },
   "outputs": [],
   "source": [
    "random.shuffle(Data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "4-TgnNhRmzKS"
   },
   "source": [
    "接下来创建词表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "ShCZ0WTZmzKT"
   },
   "outputs": [],
   "source": [
    "import re\n",
    "WordFre = {} #词频\n",
    "for d in Data:\n",
    "    content = keras.preprocessing.text.text_to_word_sequence(d['content'])\n",
    "    for c in content:\n",
    "        if c == '':\n",
    "            continue\n",
    "        word = c.lower()\n",
    "        if WordFre.get(word,None) == None:\n",
    "            WordFre[word] = 0\n",
    "        WordFre[word] += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "Tor28LsPmzKW"
   },
   "outputs": [],
   "source": [
    "threshold = 10\n",
    "WordIdx = {}\n",
    "indx = 0\n",
    "for w in sorted(WordFre.items(),key=lambda x:x[1],reverse=True):\n",
    "    key = w[0]\n",
    "    fre = w[1]\n",
    "    if fre < threshold:\n",
    "        continue\n",
    "    indx += 1\n",
    "    WordIdx[key] = indx\n",
    "WordIdx['_PAD_'] = indx+1,\n",
    "WordIdx['_UNK_'] = indx+2\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "TfBD1S3lmzKa"
   },
   "outputs": [],
   "source": [
    "WordIdxLen = len(WordIdx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "id": "6fDzc_vvmzKd",
    "outputId": "1e3624f4-c794-4203-fda3-e0ad0f56185e"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2631"
      ]
     },
     "execution_count": 9,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "WordIdx['_UNK_']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "OAKSDpxMnPG6"
   },
   "outputs": [],
   "source": [
    "json.dump(WordIdx,open('word.json','w'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "YepwcKzPmzKg"
   },
   "source": [
    "# 开始构建模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 374
    },
    "colab_type": "code",
    "id": "euZmauR3mzKh",
    "outputId": "52ba7a39-fc08-4f60-889f-c5ea16321cc8"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "embedding_1 (Embedding)      (None, None, 50)          131600    \n",
      "_________________________________________________________________\n",
      "bidirectional_1 (Bidirection (None, 128)               44160     \n",
      "_________________________________________________________________\n",
      "dense_1 (Dense)              (None, 32)                4128      \n",
      "_________________________________________________________________\n",
      "dropout_1 (Dropout)          (None, 32)                0         \n",
      "_________________________________________________________________\n",
      "dense_2 (Dense)              (None, 16)                528       \n",
      "_________________________________________________________________\n",
      "dense_3 (Dense)              (None, 1)                 17        \n",
      "=================================================================\n",
      "Total params: 180,433\n",
      "Trainable params: 180,433\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n",
      "(None, None)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(2, 1)"
      ]
     },
     "execution_count": 25,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Activation,Embedding,Bidirectional,GRU,Flatten,Dropout\n",
    "from keras import regularizers\n",
    "\n",
    "model = keras.models.Sequential()\n",
    "model.add(Embedding(input_dim=WordIdxLen+1,output_dim=50,mask_zero=True))\n",
    "model.add(Bidirectional(GRU(64,return_sequences=False,dropout=0.4)))\n",
    "model.add(Dense(units=32,activation='relu',kernel_regularizer=regularizers.l2(0.01)))\n",
    "model.add(Dropout(0.2, noise_shape=None, seed=None))\n",
    "model.add(Dense(units=16,activation='relu',kernel_regularizer=regularizers.l2(0.01)))\n",
    "model.add(Dense(units=1,activation='sigmoid'))\n",
    "model.compile(\n",
    "    optimizer='adam',\n",
    "    loss='binary_crossentropy',\n",
    "    metrics=['binary_accuracy'],\n",
    ")\n",
    "model.summary()\n",
    "\n",
    "print(model.input_shape) #查看模型输入shape\n",
    "model.predict(np.asarray([\n",
    "    [1,2,3,4,5,6,7],\n",
    "    [2,3,4,5,6,7,8]\n",
    "])).shape #查看模型输出的shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "lR7Gvdn_mzKk"
   },
   "source": [
    "# 开始fit数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "AQ8VYicqmzKk"
   },
   "outputs": [],
   "source": [
    "#获取最大字符串长度，且对每一个content进行text_to_word_sequence\n",
    "max_str_len = 0\n",
    "for d in Data:\n",
    "    d['content'] = keras.preprocessing.text.text_to_word_sequence(d['content'])\n",
    "    if len(d['content']) > max_str_len :\n",
    "        max_str_len = len(d['content'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "id": "tEMKY9EzmzKn",
    "outputId": "78d4f132-31d3-4499-e4f4-d9eced078c22"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[27, 79, 10, 8, 6, 2, 29]"
      ]
     },
     "execution_count": 20,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[WordIdx.get(i,WordIdx['_UNK_']) for i in Data[10]['content']] #查看一下转换的结果例子"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "mKMwNqNvmzKp"
   },
   "outputs": [],
   "source": [
    "#开始填充字符串\n",
    "TrainData = []\n",
    "TrainLabel = []\n",
    "for d in Data:\n",
    "    vec = [WordIdx.get(i,WordIdx['_UNK_']) for i in d['content']]\n",
    "    pad = WordIdx.get('_PAD_')\n",
    "    temp = keras.preprocessing.sequence.pad_sequences(sequences=[vec],maxlen=max_str_len,value=pad)\n",
    "    TrainData.append(temp[0])\n",
    "    TrainLabel.append(d['annotation'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "c0PTJn9jmzKr"
   },
   "outputs": [],
   "source": [
    "TrainData[:1],TrainLabel[:10] #查看填充结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "HNse8ZSGmzKw"
   },
   "outputs": [],
   "source": [
    "#分割训练和测试集\n",
    "SPILT = 0.9\n",
    "trainNum = int(len(TrainData)*SPILT)\n",
    "finalTrainData = TrainData[0:trainNum]\n",
    "finalTrainLable = TrainLabel[0:trainNum]\n",
    "finalTestData = TrainData[trainNum:]\n",
    "finalTestLabel = TrainLabel[trainNum:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "TqF4NsXGmzKz"
   },
   "outputs": [],
   "source": [
    "#模型训练\n",
    "model.fit(np.asarray(finalTrainData),np.asarray(finalTrainLable),batch_size=500, epochs=100,validation_split=0.05)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "j8tpWA9HmzK2"
   },
   "outputs": [],
   "source": [
    "model.save('model.h5') #训练结束，保存模型"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "4tWetb5RNH_t"
   },
   "source": [
    "# 模型测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "AJJni3iKmzK5"
   },
   "outputs": [],
   "source": [
    "model.evaluate(np.asarray(finalTestData),np.asarray(finalTestLabel)) #评估模型效果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 51
    },
    "colab_type": "code",
    "id": "9B-PpbVSmzK7",
    "outputId": "9179041d-572e-4ba5-a145-caec345e111c"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 65, 2]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[0.03189052]], dtype=float32)"
      ]
     },
     "execution_count": 13,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#自定义的句子测试\n",
    "WordIdx = json.load(open('word.json'))\n",
    "s = '''I love you'''\n",
    "s_q = keras.preprocessing.text.text_to_word_sequence(s)\n",
    "s_v = [WordIdx.get(i,WordIdx['_UNK_']) for i in s_q]\n",
    "print(s_v)\n",
    "model.predict(np.asarray([s_v]))"
   ]
  }
 ],
 "metadata": {
  "accelerator": "TPU",
  "colab": {
   "collapsed_sections": [],
   "name": "TrollsClassifier.ipynb",
   "provenance": [],
   "toc_visible": true,
   "version": "0.3.2"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
